knitr::opts_chunk$set(tidy=TRUE, highlight=TRUE, dev="png",
cache=TRUE, highlight=TRUE, autodep=TRUE, warning=FALSE, error=FALSE,
message=FALSE, prompt=TRUE, comment='', fig.cap='')
> library(ggplot2)
> library(dplyr)
> library(tidyr)
> library(biomaRt)
> library(pheatmap)
This is looking at a first sequencing run of single cells. There are three samples, H1, H2 and H3. There is some prior knowledge that the H1 sample might have some quality problems.
For a single-cell experiment, we expect to see a set of barcodes that come from real cells and then a drop. Below we show the barcode distribution for a published experiment on K562 cells and then each of these samples. We look at only the top 100,000 barcodes for each sample.
If you click on the tabs you can see each of the charts. We can see that the samples are missing the characteristic dip in the barcode distribution. Normally we would use this dip to determine a cutoff to filter out real cells.
This may be due to there not being enough depth of sequencing or it may be due to the samples having something wrong with them. It would be helpful to ask the sequencing core what they think about this.
> samples = c("H1", "H2", "H3")
> barcode_files = file.path("umis", samples, "cb-histogram.txt")
> K562_bc = read.table("metadata/k562-histogram.txt", header = FALSE, sep = "\t",
+ nrows = 1e+05)
> colnames(K562_bc) = c("barcode", "count")
> K562_bc$rank = as.numeric(rownames(K562_bc))
> ggplot(K562_bc, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() +
+ theme_bw() + ggtitle("K562 barcode count distribution")
> K562_small = read.table("metadata/k562-histogram-60million.txt", header = FALSE,
+ sep = "\t", nrows = 1e+05)
> colnames(K562_small) = c("barcode", "count")
> K562_small$rank = as.numeric(rownames(K562_small))
> ggplot(K562_small, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() +
+ theme_bw() + ggtitle("K562 barcode count distribution")
> h1_bc = read.table(barcode_files[1], header = FALSE, sep = "\t", nrows = 1e+05)
> colnames(h1_bc) = c("barcode", "count")
> h1_bc$rank = as.numeric(rownames(h1_bc))
> ggplot(h1_bc, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() +
+ theme_bw() + ggtitle("H1 barcode count distribution")
> h2_bc = read.table(barcode_files[2], header = FALSE, sep = "\t", nrows = 1e+05)
> colnames(h2_bc) = c("barcode", "count")
> h2_bc$rank = as.numeric(rownames(h2_bc))
> ggplot(h2_bc, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() +
+ theme_bw() + ggtitle("H2 barcode count distribution")
> h3_bc = read.table(barcode_files[3], header = FALSE, sep = "\t", nrows = 1e+05)
> colnames(h3_bc) = c("barcode", "count")
> h3_bc$rank = as.numeric(rownames(h3_bc))
> ggplot(h3_bc, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() +
+ theme_bw() + ggtitle("H3 barcode count distribution")
Here we compare the counts for the K562 data as well as for each of the proximal tubule samples, using several different metrics. We counted at the level of the transcript, so before continuing we will collapse the transcript level counts to gene level counts by summing the counts for each transcript for each gene.
> mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl",
+ host = "jul2015.archive.ensembl.org")
> conversions = getBM(attributes = c("ensembl_gene_id", "refseq_mrna", "ensembl_transcript_id",
+ "mgi_symbol"), mart = mart)
> load_counts = function(count_file) {
+ counts = read.table(count_file, header = TRUE, sep = ",")
+ counts = counts %>% tidyr::separate(gene, into = c("transcript", "version"),
+ sep = "\\.", fill = "right") %>% dplyr::left_join(conversions, c(transcript = "ensembl_transcript_id")) %>%
+ dplyr::filter(mgi_symbol != "") %>% dplyr::select(-transcript, -version,
+ -ensembl_gene_id, -refseq_mrna) %>% tidyr::gather(cell, count, -mgi_symbol) %>%
+ dplyr::group_by(cell, mgi_symbol) %>% dplyr::summarise(counts = sum(count)) %>%
+ tidyr::spread(cell, counts, fill = 0)
+ counts = as.data.frame(counts)
+ rownames(counts) = counts$mgi_symbol
+ counts$mgi_symbol = NULL
+ return(counts)
+ }
> load_bcbio_refseq = function(count_file) {
+ counts = read.table(count_file, header = TRUE, sep = ",")
+ counts = counts %>% dplyr::left_join(conversions, c(gene = "refseq_mrna")) %>%
+ dplyr::filter(mgi_symbol != "") %>% dplyr::select(-gene, -ensembl_gene_id,
+ -ensembl_transcript_id) %>% tidyr::gather(cell, count, -mgi_symbol) %>%
+ dplyr::group_by(cell, mgi_symbol) %>% dplyr::summarise(counts = sum(count)) %>%
+ tidyr::spread(cell, counts, fill = 0)
+ counts = as.data.frame(counts)
+ rownames(counts) = counts$mgi_symbol
+ counts$mgi_symbol = NULL
+ return(counts)
+ }
> mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl",
+ host = "jul2015.archive.ensembl.org")
> hconversions = getBM(attributes = c("ensembl_gene_id", "refseq_mrna", "ensembl_transcript_id",
+ "hgnc_symbol"), mart = mart)
> load_hcounts = function(count_file) {
+ counts = read.table(count_file, header = TRUE, sep = ",")
+ counts = counts %>% dplyr::left_join(hconversions, c(gene = "ensembl_transcript_id")) %>%
+ dplyr::filter(hgnc_symbol != "") %>% dplyr::select(-gene, -ensembl_gene_id,
+ -refseq_mrna) %>% tidyr::gather(cell, count, -hgnc_symbol, -hgnc_symbol) %>%
+ dplyr::group_by(cell, hgnc_symbol) %>% dplyr::summarise(counts = sum(count)) %>%
+ tidyr::spread(cell, counts, fill = 0)
+ counts = as.data.frame(counts)
+ rownames(counts) = counts$hgnc_symbol
+ counts$hgnc_symbol = NULL
+ return(counts)
+ }
> load_k562 = function(count_file) {
+ k562 = read.table(count_file, header = TRUE, sep = ",") %>% tidyr::separate(gene,
+ into = c("transcript", "symbol"), sep = "\\|") %>% dplyr::select(-transcript) %>%
+ tidyr::gather(cell, count, -symbol) %>% dplyr::group_by(cell, symbol) %>%
+ dplyr::summarise(counts = sum(count)) %>% tidyr::spread(cell, counts,
+ fill = 0)
+ k562 = as.data.frame(k562)
+ rownames(k562) = k562$symbol
+ k562$symbol = NULL
+ k562 = k562[!grepl("ERCC", rownames(k562)), ]
+ return(k562)
+ }
> count_files = file.path("no-ncrna", samples, paste(samples, ".counts", sep = ""))
> h1 = load_counts(count_files[1])
> h2 = load_counts(count_files[2])
> h3 = load_counts(count_files[3])
> count_files = file.path("new-counts", "umis", samples, paste(samples, ".counts",
+ sep = ""))
> h1_refseq = load_bcbio_refseq(count_files[1])
> h2_refseq = load_bcbio_refseq(count_files[2])
> h3_refseq = load_bcbio_refseq(count_files[3])
> k562_grch37 = load_hcounts("metadata/k562-GRCh37.counts")
> k562_refseq = load_k562("metadata/k562-refseq.counts")
We’ll look at just the top 150 cells with the most counts in each sample, so we can compare more easily to the K562 cells.
> top_n_cell = function(counts, n = 150) {
+ counts = counts[, head(order(colSums(counts), decreasing = TRUE), n)]
+ return(counts)
+ }
> k562_refseq = top_n_cell(k562_refseq)
> k562_grch37 = top_n_cell(k562_grch37)
> h1 = top_n_cell(h1)
> h2 = top_n_cell(h2)
> h3 = top_n_cell(h3)
> h1_refseq = top_n_cell(h1_refseq)
> h2_refseq = top_n_cell(h2_refseq)
> h3_refseq = top_n_cell(h3_refseq)
The H2-H3 samples have much less genes detected. This makes sense because the samples were sequenced much less deeply. However we see somewhere around a 10th of the genes detected which is quite a low amount. The distribution of genes detected is similar, but there is a sharp dropoff in genes detected for the H1-H3 samples compared to the K562 sample.
> k562_grch37_detected = data.frame(cell = colnames(k562_grch37), total = colSums(k562_grch37),
+ detected = colSums(k562_grch37 > 0), sample = "k562_grch37")
> k562_refseq_detected = data.frame(cell = colnames(k562_refseq), total = colSums(k562_refseq),
+ detected = colSums(k562_refseq > 0), sample = "k562_refseq")
> h1_detected = data.frame(cell = colnames(h1), total = colSums(h1), detected = colSums(h1 >
+ 0), sample = "h1")
> h2_detected = data.frame(cell = colnames(h2), total = colSums(h2), detected = colSums(h2 >
+ 0), sample = "h2")
> h3_detected = data.frame(cell = colnames(h3), total = colSums(h3), detected = colSums(h3 >
+ 0), sample = "h3")
> h1_refseq_detected = data.frame(cell = colnames(h1_refseq), total = colSums(h1_refseq),
+ detected = colSums(h1_refseq > 0), sample = "h1_refseq")
> h2_refseq_detected = data.frame(cell = colnames(h2_refseq), total = colSums(h2_refseq),
+ detected = colSums(h2_refseq > 0), sample = "h2_refseq")
> h3_refseq_detected = data.frame(cell = colnames(h3_refseq), total = colSums(h3_refseq),
+ detected = colSums(h3_refseq > 0), sample = "h3_refseq")
> detected = rbind(k562_grch37_detected, h1_detected, h2_detected, h3_detected,
+ k562_refseq_detected, h1_detected, h2_detected, h3_detected, h1_refseq_detected,
+ h2_refseq_detected, h3_refseq_detected)
> detected = transform(detected, cell = reorder(cell, -detected))
>
> ggplot(detected, aes(reorder(cell, detected), detected)) + geom_bar(stat = "identity",
+ position = "dodge") + facet_wrap(~sample, scale = "free") + theme_bw() +
+ scale_x_discrete(breaks = NULL) + xlab("cell") + ylab("genes detected")
For the K562 samples, the RefSeq annotation captures more counts per cell. This isn’t true of the H1-H3 samples, though.
> ggplot(detected, aes(sample, total)) + geom_boxplot() + theme_bw() + ylab("total counts per cell") +
+ xlab("")
We can also try to measure the complexity of the library, looking at how many different genes are detected plotted against the total number of counts. Here we can see that these libraries have a fairly poor distribution of counts when using the Ensembl annotation for H1-H3. Using the RefSeq annotation, it looks much better. The Ensembl annotation has a bunch of non-coding RNA and non-polyA RNA annotated, much more than the RefSeq annotation which could be leading to the particularly poor quality using the Ensembl annotation. As a comparison, the K562 sample, however, looks fine using either the Ensembl or the RefSeq annotation.
We can see from these plots a poor sequence quality in the libraries, the K562 sample has a fairly steep change in genes detected with an increased total counts. The H1-H3 samples on the other hand have a much less steep increase, a piece of evidence that these libraries are less complex than the K562 lbraries. We are comparing across species here, so the comparison isn’t a great one.
> ggplot(detected, aes(total, detected, color = sample)) + geom_point() + scale_x_log10() +
+ facet_wrap(~sample) + theme_bw() + xlab("total counts") + ylab("genes detected")
Here we can see a steep dropoff for number of cells a gene is expressed in the H1-H3 samples, meaning there are fewer genes detected as expressed in every cell. The K562 samples, by comparison, have a much less sharp falloff. This indicates that there might be poor capture of RNA from the cells, leading to a less complex library.
> k562_grch37_detected = data.frame(gene = rownames(k562_grch37), total = rowSums(k562_grch37),
+ detected = rowSums(k562_grch37 > 0), sample = "k562_grch37")
> k562_refseq_detected = data.frame(gene = rownames(k562_refseq), total = rowSums(k562_refseq),
+ detected = rowSums(k562_refseq > 0), sample = "k562_refseq")
> h1_detected = data.frame(gene = rownames(h1), total = rowSums(h1), detected = rowSums(h1 >
+ 0), sample = "h1")
> h2_detected = data.frame(gene = rownames(h2), total = rowSums(h2), detected = rowSums(h2 >
+ 0), sample = "h2")
> h3_detected = data.frame(gene = rownames(h3), total = rowSums(h3), detected = rowSums(h3 >
+ 0), sample = "h3")
> h1_refseq_detected = data.frame(gene = rownames(h1_refseq), total = rowSums(h1_refseq),
+ detected = rowSums(h1_refseq > 0), sample = "h1_refseq")
> h2_refseq_detected = data.frame(gene = rownames(h2_refseq), total = rowSums(h2_refseq),
+ detected = rowSums(h2_refseq > 0), sample = "h2_refseq")
> h3_refseq_detected = data.frame(gene = rownames(h3_refseq), total = rowSums(h3_refseq),
+ detected = rowSums(h3_refseq > 0), sample = "h3_refseq")
> detected = rbind(k562_grch37_detected, h1_detected, h2_detected, h3_detected,
+ k562_refseq_detected, h1_refseq_detected, h2_refseq_detected, h3_refseq_detected)
> detected_df = detected
> detected = transform(detected, gene = reorder(gene, -detected))
> detected = subset(detected, total > 100)
> ggplot(detected, aes(reorder(gene, detected), detected)) + geom_bar(stat = "identity",
+ position = "dodge") + facet_wrap(~sample, scale = "free") + theme_bw() +
+ scale_x_discrete(breaks = NULL) + xlab("genes") + ylab("cells detected")
Here we take a shot at identifying the most variable genes to do PCA with, it is tough with the samples we have because they are very low cou`nts.
> library(Seurat)
> k562_grch37.dat = k562_grch37
> colnames(k562_grch37.dat) = paste(colnames(k562_grch37.dat), "k562_grch37",
+ sep = "_")
> k562_grch37.seurat = new("seurat", raw.data = log(k562_grch37.dat + 1))
> k562_grch37.seurat = setup(k562_grch37.seurat, project = "k562_grch37", min.cells = 3,
+ min.genes = 500, is.expr = 0.01, names.field = 2, names.delim = "_")
> k562_grch37.seurat = mean.var.plot(k562_grch37.seurat, y.cutoff = 2, do.plot = TRUE,
+ x.low.cutoff = 0.25, x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> h1.dat = h1
> colnames(h1.dat) = paste(colnames(h1.dat), "h1", sep = "_")
> h1.seurat = new("seurat", raw.data = log(h1.dat + 1))
> h1.seurat = setup(h1.seurat, project = "h1", min.cells = 3, min.genes = 500,
+ is.expr = 0.01, names.field = 2, names.delim = "_")
> h1.seurat = mean.var.plot(h1.seurat, y.cutoff = 2, do.plot = TRUE, x.low.cutoff = 0.25,
+ x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> h2.dat = h2
> colnames(h2.dat) = paste(colnames(h2.dat), "h2", sep = "_")
> h2.seurat = new("seurat", raw.data = log(h2.dat + 1))
> h2.seurat = setup(h2.seurat, project = "h2", min.cells = 3, min.genes = 500,
+ is.expr = 0.01, names.field = 2, names.delim = "_")
> h2.seurat = mean.var.plot(h2.seurat, y.cutoff = 2, do.plot = TRUE, x.low.cutoff = 0.25,
+ x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> h3.dat = h3
> colnames(h3.dat) = paste(colnames(h3.dat), "h3", sep = "_")
> h3.seurat = new("seurat", raw.data = log(h3.dat + 1))
> h3.seurat = setup(h3.seurat, project = "h3", min.cells = 3, min.genes = 500,
+ is.expr = 0.01, names.field = 2, names.delim = "_")
> h3.seurat = mean.var.plot(h3.seurat, y.cutoff = 2, do.plot = TRUE, x.low.cutoff = 0.25,
+ x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> h1_refseq.dat = h1_refseq
> colnames(h1_refseq.dat) = paste(colnames(h1_refseq.dat), "h1_refseq", sep = "_")
> h1_refseq.seurat = new("seurat", raw.data = log(h1_refseq.dat + 1))
> h1_refseq.seurat = setup(h1_refseq.seurat, project = "h1_refseq", min.cells = 3,
+ min.genes = 500, is.expr = 0.01, names.field = 2, names.delim = "_")
> h1_refseq.seurat = mean.var.plot(h1_refseq.seurat, y.cutoff = 2, do.plot = TRUE,
+ x.low.cutoff = 0.25, x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> h2_refseq.dat = h2_refseq
> colnames(h2_refseq.dat) = paste(colnames(h2_refseq.dat), "h2_refseq", sep = "_")
> h2_refseq.seurat = new("seurat", raw.data = log(h2_refseq.dat + 1))
> h2_refseq.seurat = setup(h2_refseq.seurat, project = "h2_refseq", min.cells = 3,
+ min.genes = 500, is.expr = 0.01, names.field = 2, names.delim = "_")
> h2_refseq.seurat = mean.var.plot(h2_refseq.seurat, y.cutoff = 2, do.plot = TRUE,
+ x.low.cutoff = 0.25, x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> h3_refseq.dat = h3_refseq
> colnames(h3_refseq.dat) = paste(colnames(h3_refseq.dat), "h3_refseq", sep = "_")
> h3_refseq.seurat = new("seurat", raw.data = log(h3_refseq.dat + 1))
> h3_refseq.seurat = setup(h3_refseq.seurat, project = "h3_refseq", min.cells = 3,
+ min.genes = 500, is.expr = 0.01, names.field = 2, names.delim = "_")
> h3_refseq.seurat = mean.var.plot(h3_refseq.seurat, y.cutoff = 2, do.plot = TRUE,
+ x.low.cutoff = 0.25, x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> k562_grch37.seurat@var.genes
[1] "ABCE1" "ABHD10" "ACBD4" "ACBD5" "AFAP1-AS1"
[6] "AKT1" "ALS2" "ANGEL2" "AP3M1" "ARMC7"
[11] "ASB7" "ATG2B" "ATG4B" "ATXN1" "ATXN7L1"
[16] "B3GNTL1" "BBX" "C10orf12" "C18orf54" "C19orf12"
[21] "C19orf48" "C2orf88" "CCDC117" "CD53" "CDADC1"
[26] "CDK19" "CEP164" "CETN2" "CETN3" "COL1A2"
[31] "COLGALT2" "CREBL2" "CSTF1" "DCUN1D1" "DDIT3"
[36] "DDX41" "DKC1" "DNAJC27" "E2F1" "EEA1"
[41] "EEF2K" "EFTUD1" "EGLN3" "ELMO1" "ERLIN1"
[46] "EXTL2" "FAM171A1" "FAM222B" "FAM3C" "FBXO11"
[51] "FCGR2A" "FER" "FGD4" "GEMIN5" "GORASP2"
[56] "GPR183" "GPR63" "HBZ" "HCCS" "HDAC4"
[61] "HDHD1" "HIP1" "HIST1H1B" "HMGB3" "HOXB7"
[66] "IGFBP4" "IP6K2" "ISPD-AS1" "JMY" "KIAA1715"
[71] "KLHL12" "KLHL26" "LDLRAD3" "LINC00839" "LYPLA1"
[76] "MED21" "MED9" "MIER1" "MKLN1" "MLEC"
[81] "MORF4L2" "MYADM" "NCK1" "NCOA5" "NEMF"
[86] "NFIL3" "NME3" "NUDT16P1" "NUP62" "ORMDL1"
[91] "PAGE1" "PHOSPHO2" "PIM1" "PPP2R4" "PRNP"
[96] "PTK2B" "PTPN7" "R3HCC1L" "RC3H1" "RNF41"
[101] "RPL32" "SF1" "SGOL2" "SKAP2" "SLC25A34"
[106] "SLC29A3" "SLC38A1" "SLC50A1" "SORCS1" "SOS2"
[111] "SPTLC1" "SSX2IP" "STAT6" "STK38" "STT3A"
[116] "SUV420H1" "TAL2" "THAP2" "TMCC1-AS1" "TMEM14A"
[121] "TMEM158" "TMEM74B" "TP73" "TRAF3" "TRIB3"
[126] "TRIP12" "TRPS1" "UBOX5" "USP2-AS1" "USPL1"
[131] "UTP14A" "VPS36" "WDFY2" "WTAP" "ZCCHC7"
[136] "ZKSCAN8" "ZNF33A" "ZNF33B" "ZNF343" "ZNF35"
[141] "ZNF507" "ZNF695" "ZNF784"
> h1.seurat@var.genes
[1] "Akt1" "Amotl2" "App" "Ascc1" "Atg13" "Atxn10"
[7] "Bmpr1a" "Bms1" "Cdk19" "Cebpb" "Celsr2" "Cflar"
[13] "Clcn4-2" "Creb3l1" "Crtc3" "Cxxc1" "Cyp4a12a" "Dapk1"
[19] "Ddx6" "Dennd6a" "Derl2" "Dnajc9" "Dpm2" "Efr3a"
[25] "Eif2ak1" "Fam114a2" "Fbrsl1" "Fbxl20" "Frs3" "Gab1"
[31] "Gatad2b" "Gng12" "Hbs1l" "Hdac4" "Hnrnpc" "Inpp5f"
[37] "Irf2bp1" "Lrch1" "Map7" "Mapt" "Mpi" "Nr2c2ap"
[43] "Nras" "Nt5dc2" "Nucb1" "Pex11b" "Pign" "Plxna2"
[49] "Prpf18" "Ptpn1" "Rbm4" "Rqcd1" "Sav1" "Sec23b"
[55] "Snx13" "Spin1" "Srsf7" "Stk24" "Syne4" "Sypl2"
[61] "Taok2" "Tbc1d5" "Tbl2" "Tcerg1" "Tcf25" "Tial1"
[67] "Tmem106c" "Tmem135" "Trip10" "Tubgcp2" "Uba1" "Ubl4"
[73] "Ucp2" "Uxs1" "Vps72" "Wbp5" "Zdhhc6" "Zfp131"
[79] "Zfp322a" "Zswim8"
> h2.seurat@var.genes
[1] "Aadat" "Akr1c18" "Akt1" "Anxa5" "Apela"
[6] "Api5" "Aqp4" "Arid2" "Banf1" "Bcl2"
[11] "Cdk12" "Cdkn1a" "Celsr2" "Chsy1" "Cnot1"
[16] "Copg1" "Creb3l1" "Cryab" "Cxxc1" "Cyp4a12a"
[21] "Dapk1" "Dcun1d3" "Dennd6a" "Derl2" "Dmtn"
[26] "Dnajc5" "Dpep1" "Dusp22" "Eif2ak1" "Eif4ebp2"
[31] "Elp3" "Ergic2" "Fam114a2" "Fbxl20" "Fndc3b"
[36] "Gas1" "Gmppa" "Gng12" "Grk6" "Gtpbp1"
[41] "H2-Eb1" "Hp1bp3" "Hsd17b14" "Igf2" "Ilf3"
[46] "Kctd13" "Kdm6b" "Ldlr" "Mapt" "Mettl21b"
[51] "Mrrf" "Nipsnap3b" "Nsf" "Nsun4" "Nudcd1"
[56] "Orai1" "Paip1" "Patl1" "Pcsk7" "Pex11b"
[61] "Pigyl" "Pmepa1" "Ppp2r2d" "Psat1" "Qprt"
[66] "Rqcd1" "Rybp" "Sec23b" "Slc12a2" "Snx15"
[71] "Socs7" "Spidr" "Sptb" "Srsf7" "Tcerg1"
[76] "Tcf25" "Tmem220" "Tmem52b" "Tnrc6b" "Traf6"
[81] "Ttc9c" "Ube2w" "Ube3a" "Ubxn2b" "Ugt2b38"
[86] "Usp6nl" "Zdhhc6"
> h3.seurat@var.genes
[1] "9430016H08Rik" "Ago3" "Akt1" "Aldh1a1"
[5] "Ankib1" "Api5" "BC005624" "Cebpb"
[9] "Celsr2" "Cflar" "Clip4" "Cxxc1"
[13] "Cyp4a31" "Dennd6a" "Derl2" "Dmtn"
[17] "Efr3a" "Egf" "Ephx2" "Fbrsl1"
[21] "Fkbp1a" "Gng12" "Gnpda1" "Hs1bp3"
[25] "Ly6a" "Map7" "Mmd" "Mt1"
[29] "Nono" "Ntan1" "Patl1" "Pnkd"
[33] "Prodh2" "Rbm15b" "Rbm4" "Rnf152"
[37] "Rnf166" "Sec23b" "Shmt2" "Slc12a1"
[41] "Spin1" "Tbc1d10b" "Tcf25" "Tmem52b"
[45] "Tsc2" "Ube2w" "Umod" "Wfdc15b"
[49] "Zdhhc6"
> h1_refseq.seurat@var.genes
[1] "6330416G13Rik" "Abhd5" "Adipor2" "Akt1"
[5] "Apex1" "Asb8" "Ascc1" "Atg13"
[9] "Atp6v1b2" "Bag2" "Bms1" "Capns1"
[13] "Cdk19" "Cdr2" "Crtap" "Ctns"
[17] "Cxxc1" "Cyp4a12a" "Cyp4a12b" "D8Ertd82e"
[21] "Dctn1" "Ddx1" "Ddx47" "Ddx58"
[25] "Dnajc9" "Dpm2" "Efr3a" "Elp2"
[29] "Epm2aip1" "Ercc2" "Fam163a" "Fam89b"
[33] "Fbrsl1" "Fbxl20" "Fitm1" "Fundc2"
[37] "Gtpbp1" "Hsd17b7" "Klf3" "Kras"
[41] "Lage3" "Lpar3" "Map2k7" "Mfsd7c"
[45] "Mospd3" "Mpi" "Mtap" "Nsun4"
[49] "Nt5dc2" "Patl1" "Pbx1" "Pcgf5"
[53] "Pcnxl3" "Pgm2" "Pgpep1" "Prlr"
[57] "Psmg2" "Ptpn1" "Rbbp5" "Rbm4"
[61] "Rexo1" "Rexo2" "Samd1" "Samd10"
[65] "Sft2d3" "Sh3gl2" "Slc25a45" "Smarcc1"
[69] "Snx13" "Sp1" "Stk24" "Stx12"
[73] "Taok2" "Taok3" "Terf2" "Tgfbr3"
[77] "Tmem135" "Tnks1bp1" "Trappc1" "U2af2"
[81] "Ucp2" "Usp47" "Zdhhc6" "Zfp384"
[85] "Zfx" "Zwint"
> h2_refseq.seurat@var.genes
[1] "Aadat" "Akr1c18" "Alg3" "Anxa5" "Apela" "Apex1"
[7] "Arnt2" "Atp6v1b2" "Calcoco1" "Capns1" "Cdca4" "Cdkn2c"
[13] "Chsy1" "Clec2h" "Cln3" "Cnot1" "Creb3l1" "Ctnnd1"
[19] "Cxxc1" "Cyp4a12a" "Dcun1d3" "Ddx58" "Dnajc10" "Dpm2"
[25] "Dync2h1" "Efhd1" "Eif2ak1" "Eif4ebp2" "Elp2" "Evc2"
[31] "Fam21" "Fbxl20" "Fnbp4" "Fndc3b" "Fubp3" "Fundc2"
[37] "Fzr1" "G6pc3" "Gas1" "Gbe1" "Glis1" "Gmppa"
[43] "Got1" "Gtpbp1" "Hdc" "Hoxc10" "Hsd17b14" "Hsd17b7"
[49] "Irf6" "Itgb1" "Itsn1" "Kctd13" "Klf3" "Magt1"
[55] "Max" "Mmab" "Mrrf" "Napsa" "Nfkb1" "Nsf"
[61] "Nsun4" "Olfml1" "Pak1" "Pak1ip1" "Patl1" "Pax2"
[67] "Pbx1" "Pbx2" "Pigyl" "Plbd2" "Pmepa1" "Psat1"
[73] "Rab4b" "Scd1" "Sde2" "Slc12a2" "Slc35a3" "Slc35a5"
[79] "Snx15" "Sp1" "Srsf2" "Stard5" "Taok2" "Tbc1d10b"
[85] "Tet3" "Tmem52b" "Tmem65" "Tnks1bp1" "Trip6" "Ttc9c"
[91] "Ubap2" "Ucp2" "Ugt2b38" "Utrn" "Zdhhc6" "Zwint"
> h3_refseq.seurat@var.genes
[1] "Aadat" "Abcb7" "Anxa4" "Ap1m1" "Apex1"
[6] "Arid2" "Atat1" "Capns1" "Cldn19" "Cxxc1"
[11] "Cyp4a31" "Dctn1" "Derl2" "Dnase1" "Dync2li1"
[16] "Efr3a" "Egf" "Eif1ad" "Eml3" "Epm2aip1"
[21] "Exoc8" "Ezh1" "Fam63a" "Fbrsl1" "Fbxo28"
[26] "Gbas" "Gmppa" "Gstt3" "Hs1bp3" "Igf1r"
[31] "Kap" "Kdm4c" "Kdm6b" "Kif1b" "Kmt2b"
[36] "Max" "Mpv17l" "Mt1" "N4bp1" "Npr2"
[41] "Nt5dc3" "Pak1ip1" "Pnkd" "Ptpn1" "Rbbp5"
[46] "Reep6" "Rexo1" "Rexo2" "Rnf152" "Rsu1"
[51] "Scamp4" "Sdad1" "Serpina1f" "Slc12a1" "Slc51a"
[56] "Snx13" "Sp1" "Srsf10" "Stk24" "Syngr1"
[61] "Tbc1d10b" "Tm9sf1" "Tmem52b" "Tpm4" "U2af2"
[66] "Ucp2" "Vps4a" "Wfdc15b" "Xpot" "Zcchc3"
[71] "Zdhhc6" "Zfp358" "Zfp963"
Just to see, even though the variable genes identified looks wonky, in these samples, we ran PCA on all of the samples to see if any groups pop out.
> k562_grch37.seurat = pca(k562_grch37.seurat, pcs.print = 3, genes.print = 5,
+ do.print = FALSE)
> pca.plot(k562_grch37.seurat, pt.size = 2)
> h1.seurat = pca(h1.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h1.seurat, pt.size = 2)
> h2.seurat = pca(h2.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h2.seurat, pt.size = 2)
> h3.seurat = pca(h3.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h3.seurat, pt.size = 2)
> h1_refseq.seurat = pca(h1_refseq.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h1_refseq.seurat, pt.size = 2)
> h2_refseq.seurat = pca(h2_refseq.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h2_refseq.seurat, pt.size = 2)
> h3_refseq.seurat = pca(h3_refseq.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h3_refseq.seurat, pt.size = 2)
We can see the top 200 expressed genes have very variable expression across the cells. There are many cells where there is no expression; the K562 cells don’t have this issue.
> top_200_heatmap = function(counts) {
+ counts = as.matrix(counts)
+ counts = head(counts[order(rowSums(counts), decreasing = TRUE), ], 200)
+ pheatmap(log(counts + 1), show_rownames = FALSE, show_colnames = FALSE)
+ }
> top_200_heatmap(k562_grch37)
> top_200_heatmap(k562_refseq)
> top_200_heatmap(h1)
> top_200_heatmap(h2)
> top_200_heatmap(h3)
> top_200_heatmap(h1_refseq)
> top_200_heatmap(h2_refseq)
> top_200_heatmap(h3_refseq)
I pulled a set of housekeeping genes from Evidence Based Selection of Housekeeping Genes and converted them to human and mouse symbols. Then I made heatmaps for all of the housekeeping genes. We can see that for the K562 sample, most of the housekeeping genes are expressed strongly whereas in the H1-H3 samples, there is spare expression of the housekeeping genes.
> hk = as.character(read.table("metadata/hk.txt", header = FALSE)[, "V1"])
> hk = c(hk, c("Actb", "Gapdh"))
> human_hk = as.character(read.table("metadata/human-hk.txt", header = FALSE)[,
+ "V1"])
> human_hk = c(human_hk, c("Actb", "Gapdh"))
> housekeeping_heatmap = function(counts, hk) {
+ counts = counts[rownames(counts) %in% hk, ]
+ pheatmap(log(counts + 1), show_rownames = TRUE, show_colnames = FALSE)
+ }
> housekeeping_heatmap(k562_grch37, human_hk)
> housekeeping_heatmap(k562_refseq, human_hk)
> housekeeping_heatmap(h1, hk)
> housekeeping_heatmap(h2, hk)
> housekeeping_heatmap(h3, hk)
> housekeeping_heatmap(h1_refseq, hk)
> housekeeping_heatmap(h2_refseq, hk)
> housekeeping_heatmap(h3_refseq, hk)
It is hard to say what might be the cause of the problems we identified. If we could see the characteristic dip in the barcode distribution like we did for K562, I would feel pretty confident in recomending sequencing these samples some more. Since we do not see that dip, I’m not sure what to think.
I subsampled the K562 data and reran it, the distribution looks exactly the same and has the dip, so it isn’t due to the depth of sequencing.
This is my interpretation of Allon’s barcode plots. The matlab command:
[fLog xLog] = hist(log10(counts(counts>0)),50);
y = fLog.*(10.^xLog)/sum(fLog.*(10.^xLog));
plot(xLog,y*100,'linewidth',2)
hist from matlab returns a tuple of counts and centers. R returns a dataframe with mids and counts as columns. So, the R version of this plot is:
> allon_barcode_plot = function(bcs) {
+ bcs_hist = hist(log10(bcs$count), plot = FALSE, n = 50)
+ fLog = bcs_hist$count
+ xLog = bcs_hist$mids
+ y = fLog * (10^xLog)/sum(fLog * (10^xLog))
+ qplot(xLog, y) + geom_point() + theme_bw()
+ }
> allon_barcode_plot(K562_small)
> allon_barcode_plot(h1_bc)
> allon_barcode_plot(h2_bc)
> allon_barcode_plot(h3_bc)
```